library(bigMap)
# load aux. stuff
source('./mcsk15.R')
# first 50 principal components
X <- as.matrix(read.csv('./mcsk15_data.csv.gz'))
# ./mcsk15/start.R
library(bigMap)
X <- as.matrix(read.csv('./mcsk15_data.csv.gz'))
threads <- 40
ppx.list <- round(nrow(X) * c(.005, .01, .05, .10, .20, .30, .40, .50), 0)
# +++ start MPI cluster
mpi.cl <- bdm.mpi.start(threads)
if (is.null(mpi.cl)) return()
# +++ run
m.list <- lapply(ppx.list, function(ppx)
{
# +++ compute betas
m <- bdm.init(X, dSet.name = 'mck15', ppx = ppx, threads = threads, mpi.cl = mpi.cl)
# +++ ptSNE
m <- bdm.ptsne(NULL, m, lRate = NULL, theta = 0.0, threads = threads, mpi.cl = mpi.cl, layers = 2)
# +++ EFR
m.efr <- bdm.efr(NULL, list(m), ppx = ppx, iters = 100, threads = threads, mpi.cl = mpi.cl)
# +++ EFR (ppx = 45)
m.efr <- bdm.efr(NULL, list(m), ppx = 45, iters = 100, threads = threads, mpi.cl = mpi.cl)
# +++ kNP
m.efr <- lapply(m.efr, function(m) bdm.knp(NULL, m, threads = threads, mpi.cl = mpi.cl))
# +++ hlC
m.efr <- lapply(m.efr, function(m) bdm.hlCorr(NULL, m, threads = threads, mpi.cl = mpi.cl))
#
m.efr
})
save(m.list, file = './mcsk15_list.RData')
# +++ stop cluster
stopCluster(mpi.cl)
Submit job:
$ qsub -pe make 20 -l h_vmem=4G Rsckt ./mcsk15/start.R
# load ouput
load('./mcsk15_list.RData')
# pt-SNE embedding
m.list1 <- lapply(m.list, function(m.ppx) m.ppx[[1]])
# pt-SNE+EFC.ppx
m.list2 <- lapply(m.list, function(m.ppx) m.ppx[[2]])
# pt-SNE+EFC.45
m.list3 <- lapply(m.list, function(m.ppx) m.ppx[[3]])
sapply(m.list1, function(m) m$ppx$ppx)
## [1] 224 448 2240 4481 8962 13442 17923 22404
nulL <- lapply(m.list1, function(m) bdm.cost(m))
mcsk15.legend()
# labels
L <- mcsk15.lbls(l = 1)
nulL <- lapply(m.list1, function(m) {
m$lbls <- L
bdm.ptsne.plot(m, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)
})
nulL <- lapply(m.list2, function(m) {
m$lbls <- L
bdm.ptsne.plot(m, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)
})
nulL <- lapply(m.list3, function(m) {
m$lbls <- L
bdm.ptsne.plot(m, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)
})
hlTable <- sapply(m.list1, function(m) summary(m$hlC)[4])
hlTable <- matrix(hlTable, nrow = 1)
colnames(hlTable) <- sapply(m.list1, function(m) m$ppx$ppx)
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
kable_styling(full_width = F)
| 224 | 448 | 2240 | 4481 | 8962 | 13442 | 17923 | 22404 | |
|---|---|---|---|---|---|---|---|---|
| <hlC> | 0.1328081 | 0.0965539 | 0.1739388 | 0.1886429 | 0.1988989 | 0.314965 | 0.9076846 | 0.8988457 |
Note the HL-Correlation (~90%) for high perplexities (40%, 50% of data set size) and the similarity between the embedding and the PCA 2 first components plot;
# PCA plot
plot(X[, 1], X[, 2], pch = 15, cex = 0.3, col = MACOSKO_COLORS1[L])
# pt-SNE (ppx=17923)
m <- m.list1[[7]]
m$lbls <- L
bdm.ptsne.plot(m, class.pltt = MACOSKO_COLORS1, ptsne.cex = 0.3)
hlTable <- sapply(m.list2, function(m) summary(m$hlC)[4])
hlTable <- matrix(hlTable, nrow = 1)
colnames(hlTable) <- sapply(1:8, function(i) paste(m.list1[[i]]$ppx$ppx, '+', m.list2[[i]]$ppx$ppx))
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
kable_styling(full_width = F)
| 224 + 224 | 448 + 448 | 2240 + 2240 | 4481 + 4481 | 8962 + 8962 | 13442 + 13442 | 17923 + 17923 | 22404 + 22404 | |
|---|---|---|---|---|---|---|---|---|
| <hlC> | 0.1325114 | 0.0972726 | 0.175903 | 0.1946172 | 0.2044327 | 0.3113044 | 0.9063941 | 0.8983952 |
hlTable <- sapply(m.list3, function(m) mean(m$hlC))
hlTable <- matrix(round(hlTable, 4), nrow = 1)
colnames(hlTable) <- sapply(1:8, function(i) paste(m.list1[[i]]$ppx$ppx, '+', m.list3[[i]]$ppx$ppx))
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
kable_styling(full_width = F)
| 224 + 45 | 448 + 45 | 2240 + 45 | 4481 + 45 | 8962 + 45 | 13442 + 45 | 17923 + 45 | 22404 + 45 | |
|---|---|---|---|---|---|---|---|---|
| <hlC> | 0.1343 | 0.0978 | 0.1752 | 0.1907 | 0.1858 | 0.2115 | 0.1852 | 0.1905 |
bdm.knp.plot(m.list1)
bdm.knp.plot(m.list2)
bdm.knp.plot(m.list3)
nulL <- lapply(1:8, function(i) {
bdm.knp.plot(list(m.list1[[i]], m.list2[[i]]))
})
nulL <- lapply(1:8, function(i) {
bdm.knp.plot(list(m.list1[[i]], m.list3[[i]]))
})
rTimes <- sapply(seq_along(m.list), function(i) {
m1 <- m.list1[[i]]
m3 <- m.list3[[i]]
c(m1$ppx$t[[3]], m1$t$epoch, m1$t$ptsne[[3]], sum(m1$ppx$t[[3]]+m1$t$ptsne[[3]]), m3$t$efr[[3]])
})
rTimes <- round(rTimes /60, 2)
colnames(rTimes) <- sapply(m.list1, function(m) m$ppx$ppx)
rownames(rTimes) <- c('betas', 'epoch', 'pt-SNE', 'total', 'EFR.45')
knitr::kable(rTimes, caption = 'Computation times (min)') %>%
kable_styling(full_width = F)
| 224 | 448 | 2240 | 4481 | 8962 | 13442 | 17923 | 22404 | |
|---|---|---|---|---|---|---|---|---|
| betas | 0.31 | 0.35 | 0.50 | 0.70 | 1.12 | 1.09 | 0.63 | 0.59 |
| epoch | 0.28 | 0.26 | 0.26 | 0.27 | 0.28 | 0.27 | 0.27 | 0.27 |
| pt-SNE | 20.39 | 18.99 | 16.51 | 15.58 | 14.67 | 13.43 | 13.59 | 13.54 |
| total | 20.71 | 19.34 | 17.01 | 16.27 | 15.79 | 14.52 | 14.22 | 14.13 |
| EFR.45 | 4.77 | 4.79 | 4.80 | 4.80 | 4.80 | 4.79 | 4.78 | 4.80 |
pt-SNE run on: Intel(R) Xeon(R) CPU E5-2650 v3 2.30GHz, 32Mb cache, 41 cores, 4GB/core RAM.
EFR run on: Intel(R) Xeon(R) CPU E5-2650 v3 2.30GHz, 32Mb cache, 20 cores, 4GB/core RAM.